{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "import pandas as pd\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.feature_extraction.text import CountVectorizer\n",
        "from tensorflow.keras.models import Sequential\n",
        "from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout\n",
        "from tensorflow.keras.preprocessing.text import Tokenizer\n",
        "from tensorflow.keras.preprocessing.sequence import pad_sequences"
      ],
      "metadata": {
        "id": "q9n8UiMR74n3"
      },
      "execution_count": 12,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "data = pd.read_csv('spam_ham_dataset.csv')\n",
        "X = data['text']\n",
        "\n",
        "# Encode the string labels as 0/1 targets for the binary classifier.\n",
        "# Series.map leaves NaN for any value missing from the dict, so fail fast here\n",
        "# rather than letting NaN targets reach model.fit.\n",
        "y = data['label'].map({'ham': 0, 'spam': 1})\n",
        "assert y.notna().all(), \"'label' column contains values other than 'ham'/'spam'\""
      ],
      "metadata": {
        "id": "mzWr2hfq773m"
      },
      "execution_count": 13,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Hold out 20% of the emails for final evaluation; the fixed seed keeps the split repeatable.\n",
        "X_tr, X_te, y_tr, y_te = train_test_split(\n",
        "    X,\n",
        "    y,\n",
        "    test_size=0.2,\n",
        "    random_state=42,\n",
        ")"
      ],
      "metadata": {
        "id": "DUbwo5FV7-LB"
      },
      "execution_count": 14,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Every email is truncated or padded to this many token ids.\n",
        "max_seq_len = 100\n",
        "\n",
        "# Learn the word -> integer-id vocabulary from the training texts only.\n",
        "tokenizer = Tokenizer()\n",
        "tokenizer.fit_on_texts(X_tr)\n",
        "\n",
        "# Encode each split with that vocabulary, then pad at the end ('post') to a fixed length.\n",
        "X_tr_seq = tokenizer.texts_to_sequences(X_tr)\n",
        "X_tr_pad = pad_sequences(X_tr_seq, maxlen=max_seq_len, padding='post')\n",
        "\n",
        "X_te_seq = tokenizer.texts_to_sequences(X_te)\n",
        "X_te_pad = pad_sequences(X_te_seq, maxlen=max_seq_len, padding='post')"
      ],
      "metadata": {
        "id": "Em0mEiBR8BPb"
      },
      "execution_count": 15,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# One embedding row per known word, plus one extra row for index 0.\n",
        "vocab_size = len(tokenizer.word_index) + 1\n",
        "\n",
        "# Embedding -> two stacked LSTMs (with dropout in between) -> one sigmoid unit.\n",
        "model = Sequential([\n",
        "    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_seq_len),\n",
        "    LSTM(64, return_sequences=True),  # pass every timestep on to the second LSTM\n",
        "    Dropout(0.5),\n",
        "    LSTM(64),\n",
        "    Dense(1, activation='sigmoid'),\n",
        "])\n",
        "model.compile(\n",
        "    loss='binary_crossentropy',\n",
        "    optimizer='adam',\n",
        "    metrics=['accuracy'],\n",
        ")\n",
        "\n",
        "# validation_split holds back 20% of the training rows to report val_loss / val_accuracy.\n",
        "model.fit(\n",
        "    X_tr_pad,\n",
        "    y_tr,\n",
        "    epochs=10,\n",
        "    batch_size=32,\n",
        "    validation_split=0.2,\n",
        ")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "s_M5u0tM8Fkb",
        "outputId": "93d7c7f8-40b7-44be-b425-8f57dcc18190"
      },
      "execution_count": 16,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Epoch 1/10\n",
            "104/104 [==============================] - 35s 297ms/step - loss: 0.5534 - accuracy: 0.7482 - val_loss: 0.5267 - val_accuracy: 0.7705\n",
            "Epoch 2/10\n",
            "104/104 [==============================] - 28s 272ms/step - loss: 0.5402 - accuracy: 0.7582 - val_loss: 0.5110 - val_accuracy: 0.7742\n",
            "Epoch 3/10\n",
            "104/104 [==============================] - 29s 277ms/step - loss: 0.4195 - accuracy: 0.8210 - val_loss: 0.2600 - val_accuracy: 0.9118\n",
            "Epoch 4/10\n",
            "104/104 [==============================] - 28s 271ms/step - loss: 0.2672 - accuracy: 0.8951 - val_loss: 0.1177 - val_accuracy: 0.9614\n",
            "Epoch 5/10\n",
            "104/104 [==============================] - 28s 268ms/step - loss: 0.0643 - accuracy: 0.9797 - val_loss: 0.1393 - val_accuracy: 0.9650\n",
            "Epoch 6/10\n",
            "104/104 [==============================] - 28s 268ms/step - loss: 0.0444 - accuracy: 0.9879 - val_loss: 0.1399 - val_accuracy: 0.9710\n",
            "Epoch 7/10\n",
            "104/104 [==============================] - 29s 280ms/step - loss: 0.0451 - accuracy: 0.9912 - val_loss: 0.1501 - val_accuracy: 0.9674\n",
            "Epoch 8/10\n",
            "104/104 [==============================] - 28s 268ms/step - loss: 0.0311 - accuracy: 0.9946 - val_loss: 0.1582 - val_accuracy: 0.9686\n",
            "Epoch 9/10\n",
            "104/104 [==============================] - 29s 279ms/step - loss: 0.0275 - accuracy: 0.9952 - val_loss: 0.1492 - val_accuracy: 0.9710\n",
            "Epoch 10/10\n",
            "104/104 [==============================] - 29s 283ms/step - loss: 0.0249 - accuracy: 0.9955 - val_loss: 0.1553 - val_accuracy: 0.9698\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<keras.callbacks.History at 0x78bd259d00a0>"
            ]
          },
          "metadata": {},
          "execution_count": 16
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Show each layer's output shape and parameter count for the trained model.\n",
        "model.summary()"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "95K3lpbr8SkX",
        "outputId": "72b34217-6840-4f9e-f435-64c8861c5222"
      },
      "execution_count": 17,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Model: \"sequential_2\"\n",
            "_________________________________________________________________\n",
            " Layer (type)                Output Shape              Param #   \n",
            "=================================================================\n",
            " embedding_2 (Embedding)     (None, 100, 128)          6625664   \n",
            "                                                                 \n",
            " lstm_4 (LSTM)               (None, 100, 64)           49408     \n",
            "                                                                 \n",
            " dropout_2 (Dropout)         (None, 100, 64)           0         \n",
            "                                                                 \n",
            " lstm_5 (LSTM)               (None, 64)                33024     \n",
            "                                                                 \n",
            " dense_2 (Dense)             (None, 1)                 65        \n",
            "                                                                 \n",
            "=================================================================\n",
            "Total params: 6,708,161\n",
            "Trainable params: 6,708,161\n",
            "Non-trainable params: 0\n",
            "_________________________________________________________________\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 18,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "JjiVnmmi5Mm6",
        "outputId": "dcd0456d-87ea-454d-a363-1ad8193eb132"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "33/33 [==============================] - 1s 42ms/step - loss: 0.1547 - accuracy: 0.9700\n",
            "Test Loss: 0.1547\n",
            "Test Accuracy: 0.9700\n"
          ]
        }
      ],
      "source": [
        "# Score the held-out test split; evaluate returns the loss followed by the compiled accuracy metric.\n",
        "loss, acc = model.evaluate(x=X_te_pad, y=y_te)\n",
        "print(f\"Test Loss: {loss:.4f}\")\n",
        "print(f\"Test Accuracy: {acc:.4f}\")\n"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "email_text = input(\"Enter an email text: \")\n",
        "\n",
        "# Preprocess exactly like the training data: same fitted tokenizer and the same\n",
        "# max_seq_len, so the input always matches the length the model was trained on.\n",
        "sequence = tokenizer.texts_to_sequences([email_text])\n",
        "padded_sequence = pad_sequences(sequence, maxlen=max_seq_len, padding='post')\n",
        "prediction = model.predict(padded_sequence)\n",
        "\n",
        "# predict returns a (1, 1) array here (one input row, one sigmoid output);\n",
        "# compare the scalar probability instead of truth-testing the whole array.\n",
        "spam_probability = float(prediction[0][0])\n",
        "if spam_probability > 0.5:\n",
        "    print(\"Prediction: Spam\")\n",
        "else:\n",
        "    print(\"Prediction: Ham\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "WvGAXfeW7f3h",
        "outputId": "084ba5c2-4e54-47af-d239-b83a6485b511"
      },
      "execution_count": 19,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Enter an email text: you won 1 mill\n",
            "1/1 [==============================] - 1s 855ms/step\n",
            "Prediction: Spam\n"
          ]
        }
      ]
    }
  ]
}